Libraries¶

In [1]:
import pandas as pd
import os
import us
from us import states
import plotly.graph_objects as go
import plotly.express as px
import requests
import dotenv

os.chdir('C:\\Users\\nguye\\Documents\\UVA\\Term 3\\Bayesian Machine Learning\\Project\\Data')
os.getcwd()
Out[1]:
'C:\\Users\\nguye\\Documents\\UVA\\Term 3\\Bayesian Machine Learning\\Project\\Data'
In [2]:
dotenv.load_dotenv('.env')
Out[2]:
True
In [3]:
censuskey = os.getenv('censuskey')
In [5]:
censuskey
Out[5]:
'33b860c51f0e598ef29489986b5b936ee248a38b'

Loading Data¶

In [6]:
cdc_lyme = pd.read_csv('LD-Case-Counts-by-County-00-19.csv', encoding = 'latin1')
In [7]:
cdc_lyme
Out[7]:
Ctyname Stname STCODE CTYCODE Cases2000 Cases2001 Cases2002 Cases2003 Cases2004 Cases2005 ... Cases2010 Cases2011 Cases2012 Cases2013 Cases2014 Cases2015 Cases2016 Cases2017 Cases2018 Cases2019
0 Autauga County Alabama 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 0 0 1
1 Baldwin County Alabama 1 3 1 0 1 0 0 0 ... 0 1 1 0 3 1 2 2 0 0
2 Barbour County Alabama 1 5 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 1
3 Bibb County Alabama 1 7 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 1 0
4 Blount County Alabama 1 9 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 2 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3188 Teton County Wyoming 56 39 0 0 1 0 1 1 ... 0 0 2 1 1 0 0 2 0 0
3189 Uinta County Wyoming 56 41 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3190 Washakie County Wyoming 56 43 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3191 Weston County Wyoming 56 45 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3192 Wyoming Wyoming 56 999 0 0 0 0 0 0 ... 0 1 1 1 0 0 0 0 0 3

3193 rows × 24 columns

Data Wrangling¶

Per the CDC, code '999' represents unknown county within a single state so we'll compare the total number cases per county each year and see how much of it represents county.

In [8]:
cdc_lyme[cdc_lyme['CTYCODE'] == '999']
Out[8]:
Ctyname Stname STCODE CTYCODE Cases2000 Cases2001 Cases2002 Cases2003 Cases2004 Cases2005 ... Cases2010 Cases2011 Cases2012 Cases2013 Cases2014 Cases2015 Cases2016 Cases2017 Cases2018 Cases2019

0 rows × 24 columns

In [9]:
cdc_lyme[cdc_lyme['CTYCODE'] != '999']
Out[9]:
Ctyname Stname STCODE CTYCODE Cases2000 Cases2001 Cases2002 Cases2003 Cases2004 Cases2005 ... Cases2010 Cases2011 Cases2012 Cases2013 Cases2014 Cases2015 Cases2016 Cases2017 Cases2018 Cases2019
0 Autauga County Alabama 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 0 0 1
1 Baldwin County Alabama 1 3 1 0 1 0 0 0 ... 0 1 1 0 3 1 2 2 0 0
2 Barbour County Alabama 1 5 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 1
3 Bibb County Alabama 1 7 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 1 0
4 Blount County Alabama 1 9 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 2 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3188 Teton County Wyoming 56 39 0 0 1 0 1 1 ... 0 0 2 1 1 0 0 2 0 0
3189 Uinta County Wyoming 56 41 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3190 Washakie County Wyoming 56 43 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3191 Weston County Wyoming 56 45 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3192 Wyoming Wyoming 56 999 0 0 0 0 0 0 ... 0 1 1 1 0 0 0 0 0 3

3193 rows × 24 columns

In [10]:
cdc_lyme['CTYCODE'] = cdc_lyme['CTYCODE'].astype(str).str.zfill(3)
cdc_lyme['STCODE'] = cdc_lyme['STCODE'].astype(str).str.zfill(2)
cdc_lyme['FIPS'] = cdc_lyme['STCODE'] + cdc_lyme['CTYCODE']
cdc_lyme['stabbr'] = cdc_lyme['STCODE'].map(us.states.mapping('fips', 'abbr'))
state_abbr = cdc_lyme['stabbr'].unique()
In [11]:
state_count_2000 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2000'].sum() for st in state_abbr]
state_count_2001 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2001'].sum() for st in state_abbr]
state_count_2002 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2002'].sum() for st in state_abbr]
state_count_2003 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2003'].sum() for st in state_abbr]
state_count_2004 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2004'].sum() for st in state_abbr]
state_count_2005 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2005'].sum() for st in state_abbr]
state_count_2016 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2006'].sum() for st in state_abbr]
state_count_2007 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2007'].sum() for st in state_abbr]
state_count_2008 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2008'].sum() for st in state_abbr]
state_count_2009 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2009'].sum() for st in state_abbr]
state_count_2010 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2010'].sum() for st in state_abbr]
state_count_2011 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2011'].sum() for st in state_abbr]
state_count_2012 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2012'].sum() for st in state_abbr]
state_count_2013 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2013'].sum() for st in state_abbr]
state_count_2014 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2014'].sum() for st in state_abbr]
state_count_2015 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2015'].sum() for st in state_abbr]
state_count_2016 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2016'].sum() for st in state_abbr]
state_count_2017 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2017'].sum() for st in state_abbr]
state_count_2018 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2018'].sum() for st in state_abbr]
state_count_2019 = [cdc_lyme[cdc_lyme['stabbr'] == st]['Cases2019'].sum() for st in state_abbr]
In [12]:
cases_year = ['Cases2000', 'Cases2001',
       'Cases2002', 'Cases2003', 'Cases2004', 'Cases2005', 'Cases2006',
       'Cases2007', 'Cases2008', 'Cases2009', 'Cases2010', 'Cases2011',
       'Cases2012', 'Cases2013', 'Cases2014', 'Cases2015', 'Cases2016',
       'Cases2017', 'Cases2018', 'Cases2019']
sum = []
In [13]:
count_summary = pd.DataFrame({'State': state_abbr, 
                            '2000': state_count_2000,
                            '2001': state_count_2001,
                            '2002': state_count_2002,
                            '2003': state_count_2003,
                            '2004': state_count_2004,
                            '2005': state_count_2005,
                            '2006': state_count_2016,
                            '2007': state_count_2007,
                            '2008': state_count_2008,
                            '2009': state_count_2009,
                            '2010': state_count_2010,
                            '2011': state_count_2011,
                            '2012': state_count_2012,
                            '2013': state_count_2013,
                            '2014': state_count_2014,
                            '2015': state_count_2015,
                            '2016': state_count_2016,
                            '2017': state_count_2017,
                            '2018': state_count_2018,
                            '2019': state_count_2019})
In [14]:
count_summary
Out[14]:
State 2000 2001 2002 2003 2004 2005 2006 2007 2008 ... 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
0 AL 6 10 11 8 6 3 38 13 9 ... 2 24 25 24 64 25 38 41 36 66
1 AK 2 2 3 3 3 4 15 10 6 ... 7 11 10 14 8 9 15 10 11 3
2 AZ 2 3 4 4 13 10 13 2 8 ... 2 15 13 32 21 12 13 28 7 10
3 AR 7 4 3 0 0 0 2 1 0 ... 0 0 0 0 0 0 2 6 4 18
4 CA 96 95 97 86 48 95 134 75 74 ... 129 92 70 112 73 98 134 145 104 144
5 CO 0 0 1 0 0 0 0 0 3 ... 3 0 0 0 0 0 0 4 3 8
6 CT 3773 3597 4631 1403 1348 1810 1748 3058 3896 ... 3068 3039 2657 2925 2360 2541 1748 2051 1859 1233
7 DE 167 152 194 212 339 646 506 715 772 ... 656 873 669 509 417 435 506 608 520 641
8 DC 11 17 25 14 16 10 103 116 74 ... 42 0 0 35 40 121 103 84 79 100
9 FL 54 43 79 43 46 47 216 30 88 ... 84 115 118 138 155 166 216 210 169 162
10 GA 0 0 2 10 12 6 4 11 35 ... 10 32 31 8 4 8 4 8 19 18
11 HI 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 ID 4 5 4 3 6 2 17 9 9 ... 9 4 5 19 9 9 17 20 9 14
13 IL 35 32 47 71 87 127 237 149 108 ... 135 194 204 337 233 287 237 273 276 395
14 IN 23 26 21 25 32 33 152 55 42 ... 78 94 74 110 110 138 152 142 155 189
15 IA 34 36 42 58 49 89 232 123 109 ... 85 100 165 247 194 318 232 255 283 303
16 KS 17 2 7 4 3 3 39 8 16 ... 10 17 19 34 20 23 39 40 30 35
17 KY 13 23 25 17 15 5 33 6 5 ... 5 3 14 40 44 49 33 20 22 22
18 LA 8 8 5 7 2 3 7 2 3 ... 3 2 7 0 2 3 7 12 4 8
19 ME 71 108 219 175 225 247 1487 529 908 ... 751 1006 1111 1373 1401 1201 1487 1850 1405 2167
20 MD 688 608 738 691 891 1235 1866 2576 2218 ... 1617 1351 1651 1197 1373 1728 1866 1891 1382 1417
21 MA 1158 1164 1807 1532 1532 2336 198 2988 4582 ... 3263 2476 5138 5290 5304 4224 198 410 16 7
22 MI 23 21 26 12 27 62 221 51 92 ... 95 104 98 168 127 148 221 291 262 413
23 MN 465 461 867 474 1023 917 2126 1238 1282 ... 1960 2124 1515 2340 1416 1805 2126 2318 1541 1528
24 MS 3 8 12 21 0 0 1 1 1 ... 0 5 1 0 2 4 1 1 4 4
25 MO 47 37 41 70 25 15 10 10 6 ... 4 8 2 3 10 5 10 12 11 17
26 MT 0 0 0 0 0 0 17 4 17 ... 4 11 6 18 7 5 17 12 7 8
27 NE 5 4 6 2 2 2 14 7 12 ... 8 11 15 10 7 11 14 14 15 10
28 NV 4 4 2 3 1 3 15 15 12 ... 2 5 10 16 6 7 15 17 14 17
29 NH 84 129 261 190 226 265 891 896 1601 ... 1339 1299 1450 1687 724 529 891 1381 1428 1710
30 NJ 2459 2020 2349 2887 2698 3363 4350 3134 3485 ... 3712 4262 3576 3766 3286 4855 4350 5092 4000 3619
31 NM 0 1 1 1 1 3 1 5 8 ... 5 6 1 6 0 0 1 3 2 7
32 NY 4329 4083 5535 5399 5100 5565 3882 4165 7794 ... 3425 4490 2998 4615 3736 4314 3882 5155 3638 4243
33 NC 47 41 137 156 122 49 272 53 47 ... 82 88 122 180 170 230 272 295 212 334
34 ND 2 0 1 0 0 3 32 12 10 ... 33 27 15 29 14 33 32 56 33 38
35 OH 61 44 82 66 50 58 160 33 45 ... 44 53 67 93 119 154 160 270 293 467
36 OK 1 0 0 0 3 0 0 1 2 ... 0 2 4 3 0 0 0 1 0 0
37 OR 13 15 12 16 11 3 61 6 38 ... 39 38 48 43 45 31 61 84 70 65
38 PA 2343 2806 3989 5730 3985 4287 11443 3994 3818 ... 3805 5362 5033 5758 7487 9048 11443 11900 10208 8998
39 RI 675 510 852 736 249 39 903 177 210 ... 181 159 217 724 904 904 903 1132 1111 971
40 SC 25 6 26 18 22 15 51 31 29 ... 29 37 44 42 37 42 51 21 39 47
41 SD 0 0 2 1 1 2 11 0 3 ... 1 4 4 4 2 5 11 12 7 10
42 TN 28 31 28 20 20 8 25 31 31 ... 36 37 30 25 17 26 25 47 29 45
43 TX 77 75 139 85 98 69 71 87 153 ... 142 74 75 82 40 54 71 58 47 41
44 UT 3 1 5 2 1 2 19 7 5 ... 3 9 5 15 13 7 19 25 27 19
45 VT 40 18 37 43 50 54 761 138 404 ... 356 623 522 893 599 710 761 1092 576 1064
46 VA 149 156 259 195 216 274 1350 959 933 ... 1245 1023 1110 1307 1346 1539 1350 1657 1139 1199
47 WA 9 9 11 7 14 13 31 12 23 ... 16 19 15 18 15 24 31 37 20 43
48 WV 35 16 26 31 38 61 368 84 135 ... 145 118 97 143 136 289 368 648 671 885
49 WI 631 597 1090 740 1144 1459 2295 1814 2034 ... 3488 3649 1766 1872 1361 1894 2295 3000 1869 2178
50 WY 3 1 2 2 4 3 1 3 3 ... 0 2 4 3 3 1 1 4 0 5

51 rows × 21 columns

In [15]:
count_summary_melted = pd.melt(count_summary, id_vars = ['State'], value_vars = [str(i) for i in range(2000, 2019)])
count_summary_melted.rename(columns = {'variable': 'Year', 'value': 'Count'}, inplace = True)
In [16]:
count_summary_melted
Out[16]:
State Year Count
0 AL 2000 6
1 AK 2000 2
2 AZ 2000 2
3 AR 2000 7
4 CA 2000 96
... ... ... ...
964 VA 2018 1139
965 WA 2018 20
966 WV 2018 671
967 WI 2018 1869
968 WY 2018 0

969 rows × 3 columns

In [17]:
count_summary_melted['Year'] = pd.to_datetime(count_summary_melted['Year']).dt.year
In [18]:
count_summary_melted
Out[18]:
State Year Count
0 AL 2000 6
1 AK 2000 2
2 AZ 2000 2
3 AR 2000 7
4 CA 2000 96
... ... ... ...
964 VA 2018 1139
965 WA 2018 20
966 WV 2018 671
967 WI 2018 1869
968 WY 2018 0

969 rows × 3 columns

Exploratory Data Analysis¶

In [19]:
fig = px.choropleth(count_summary_melted,
                   locations = count_summary_melted['State'],
                   color = count_summary_melted['Count'],
                   hover_name = count_summary_melted['State'],
                   animation_frame = count_summary_melted['Year'],
                   locationmode = 'USA-states', scope = 'usa',
                   height = 800)
fig.show()
In [20]:
root = 'https://api.census.gov/data'
year = '2000'
dataset = 'dec/sf1'
url = '/'.join([root, year, dataset])
url
Out[20]:
'https://api.census.gov/data/2000/dec/sf1'
In [21]:
predicates = {'get': ['P001001', 'NAME'], 
             'for': 'states:*',
             'key': censuskey}
r = requests.get(url, params = predicates)
In [22]:
predicates
Out[22]:
{'get': ['P001001', 'NAME'],
 'for': 'states:*',
 'key': '33b860c51f0e598ef29489986b5b936ee248a38b'}
In [23]:
r.text
Out[23]:
'error: unknown/unsupported geography heirarchy'